CD7: full spectrum dimensionality reduction of neutrophils¶

In this notebook, we perform dimensionality reduction and clustering of the CD7 data using features extracted with SCIP.
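The workflow: load the SCIP feature matrix, drop zero-variance and missing-value features, normalize intensities per acquisition, remove replicate-associated features with a mutual-information filter, collapse correlated features through hierarchical clustering, and finally embed and cluster the cells with UMAP and Leiden.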

In [213]:
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
In [214]:
import warnings
In [215]:
from scip_workflows.common import *
In [216]:
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
from sklearn.preprocessing import scale
import anndata
import scanpy
scanpy.settings.verbosity = 3

import flowutils

import scipy.stats
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import fcluster
from scipy.spatial.distance import squareform

from kneed import KneeLocator
In [217]:
from scip.features import intensity
# SCIP intensity feature names; kurtosis and skewness are excluded
props = intensity.props.copy()
props.remove("kurtosis")
props.remove("skewness")
In [218]:
def asinh_scale(x, t):
    # asinh-transform intensities (FlowUtils parameterisation), then mean-center without unit-variance scaling
    transformed = flowutils.transforms.asinh(x, channel_indices=None, t=t, m=4.5, a=1)
    return scale(transformed, with_std=False)
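FlowUtils implements the GatingML-style parameterised asinh (roughly: t sets the top of scale, m the number of decades, a the extra negative decades). A minimal sketch of the helper on toy values, mirroring the per-column 1-D usage later in this notebook (hypothetical data):

v = numpy.array([0.0, 10.0, 100.0, 1000.0])
asinh_scale(v, t=v.max())  # asinh-compressed, mean-centered values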
In [219]:
plt.rcParams['figure.dpi'] = 150

Data¶

In [170]:
# when run via Snakemake, paths and wildcards come from the workflow; otherwise fall back to local defaults
try:
    features = snakemake.input.features
    index = snakemake.input.index
    columns = snakemake.input.columns
    fillna = bool(int(snakemake.wildcards.fillna))
    output = snakemake.output[0]
except NameError:
    # data_dir = Path("/data/gent/vo/000/gvo00070/vsc42015/datasets/cd7/800/results/scip/202203221745/")
    data_dir = Path("/home/maximl/scratch/data/vsc/datasets/cd7/800/scip/061020221736/")
    features = data_dir / "features.parquet"
    index = data_dir / "indices" / "index.npy"
    columns = data_dir / "indices" / "columns.npy"
    fillna = False
    output = data_dir / f"adata_{int(fillna)}.h5ad"
In [171]:
df = pq.read_table(features).to_pandas()
df = df.set_index(["meta_panel", "meta_replicate", "meta_P", "meta_id"])
df = df.loc["D"]  # keep panel D only
df = df[[c for c in numpy.load(columns, allow_pickle=True) if c in df.columns]]  # restrict to the pre-selected columns
df = df.loc[numpy.load(index, allow_pickle=True)]  # restrict to the pre-selected objects
df = df.sort_index()

df.shape
Out[171]:
(29927, 1188)
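Panel D contains 29,927 objects described by 1,188 features.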

Removing zero variance features¶

In [172]:
var = VarianceThreshold().fit(df.filter(regex="feat"))
In [173]:
df = pandas.concat([
    df.filter(regex="feat").iloc[:, var.get_support()],
    df.filter(regex="meta")
], axis=1)

NaN values¶

In [174]:
df.isna().all(axis=0).any()
Out[174]:
False
In [175]:
df.filter(regex="feat").isna().all(axis=1).sum()
Out[175]:
0
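No feature column is entirely missing and no object lacks all of its feature values, so missing values are sparse and can be handled per column.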
In [176]:
seaborn.histplot(data=df.isna().sum())
Out[176]:
<AxesSubplot:ylabel='Count'>
In [177]:
if fillna:
    df = df.fillna(0)
else:
    # drop every feature column that still contains missing values
    df = df.drop(columns=df.columns[df.isna().sum() > 0])

Analysis¶

In [178]:
obs = df.filter(regex='meta').reset_index()
obs.index = df.index
adata = anndata.AnnData(df.filter(regex="feat").astype(numpy.float32), obs=obs)
adata.raw = adata.copy()  # keep the unprocessed feature matrix for later reference

adata.obs["meta_replicate"] = adata.obs["meta_replicate"].astype("category")
In [180]:
markers = [col for col in adata.var.index if col.startswith("feat_sum")]
In [181]:
adata_pre = adata.copy()
In [182]:
%%time
scanpy.pp.scale(adata_pre)
scanpy.tl.pca(adata_pre, svd_solver='arpack')
scanpy.pp.neighbors(adata_pre, n_neighbors=30)
scanpy.tl.umap(adata_pre)
scanpy.pl.umap(adata_pre, color=["meta_replicate"])
computing PCA
    with n_comps=50
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:08)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:45)
CPU times: user 2min, sys: 11.8 s, total: 2min 12s
Wall time: 55.6 s
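This baseline UMAP of the minimally processed features, colored by replicate, visualizes the batch effect that the normalization and feature selection below aim to remove.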
In [183]:
# treat median-, area- and euler-based features as discrete
discrete = ["median", "area", "euler"]
discrete_cols = [c for c in adata.var_names if any(d in c for d in discrete)]
discrete_cols_i = [i for i, c in enumerate(adata.var_names) if any(d in c for d in discrete)]
In [184]:
adata.var["is_marker"] = [any(n.endswith("feat_combined_sum_%s" % m) for m in ["DAPI", "EGFP", "RPe", "APC"]) for n in adata.var_names]
adata.var["do_asinh"] = [(any(m in n for m in ["DAPI", "EGFP", "RPe", "APC"]) and any(o in n for o in props)) for n in adata.var_names]
In [185]:
%%time
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    sc_df = scanpy.get.obs_df(adata, keys=adata.var_names.to_list())
    # asinh-transform and center the fluorescence intensity features per acquisition (replicate × well position)
    sc_df[adata.var_names[adata.var.do_asinh].to_list()] = sc_df[adata.var_names[adata.var.do_asinh]].groupby(["meta_replicate", "meta_P"]).transform(lambda x: asinh_scale(x, x.max()))
    # z-score all remaining features within the same groups
    sc_df[adata.var_names[~adata.var.do_asinh].to_list()] = sc_df[adata.var_names[~adata.var.do_asinh]].groupby(["meta_replicate", "meta_P"]).transform(lambda x: scale(x))

adata = anndata.AnnData(X=sc_df, obs=adata.obs, var=adata.var, raw=adata.raw)
CPU times: user 1min 26s, sys: 3.08 s, total: 1min 29s
Wall time: 1min 28s
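Intensity features on the fluorescence channels are asinh-transformed and centered per acquisition (replicate × well position); all other features are z-scored within the same groups. The marker distributions are plotted below to check that acquisition-level shifts have been removed.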
In [186]:
def map_names(a):
    return {
        "feat_combined_sum_DAPI": "DAPI",
        "feat_combined_sum_EGFP": "CD45",
        "feat_combined_sum_RPe": "Siglec 8",
        "feat_combined_sum_APC": "CD15"
    }[a]
In [187]:
aligned_df = scanpy.get.obs_df(adata, keys=adata.var_names[adata.var.is_marker].to_list()).reset_index()

melted_df = pandas.melt(aligned_df, id_vars=["meta_P", "meta_replicate"], value_vars=adata.var_names[adata.var.is_marker].to_list())
melted_df.variable = melted_df.variable.apply(map_names)

grid = seaborn.FacetGrid(data=melted_df, col="meta_replicate", row="variable", sharey="row", aspect=1.5, margin_titles=True)
grid.map_dataframe(seaborn.stripplot, x="meta_P", y="value", size=1, alpha=0.5)

grid.set_axis_labels("Well image position", "Fluorescence intensity")
grid.set_titles(col_template="Replicate {col_name}", row_template="{row_name}")

grid.add_legend()

# plt.savefig(data_dir / "figures/qc_intensity_distribution_post.png", bbox_inches='tight', pad_inches=0)
Out[187]:
<seaborn.axisgrid.FacetGrid at 0x7f0ff9662d30>
In [188]:
def rediscretize(v):
    # re-bin a discretized feature: map every value to the mean of its histogram bin
    bin_idx = numpy.digitize(v, bins=numpy.histogram_bin_edges(v))
    bin2mu = [numpy.mean(v[bin_idx == i]) for i in range(1, numpy.max(bin_idx) + 1)]
    return numpy.fromiter((bin2mu[i - 1] for i in bin_idx), dtype=float)
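A quick illustration of the helper (hypothetical values): every entry is mapped to the mean of its histogram bin. Note that empty bins produce NaN means, which is what triggers the "Mean of empty slice" warnings in the next cell.

v = numpy.array([1.0, 1.1, 5.0, 5.2, 9.9])
rediscretize(v)  # -> array([1.05, 1.05, 5.1, 5.1, 9.9])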
In [189]:
adata = adata[adata.to_df().isna().sum(axis=1) == 0].copy()
In [190]:
X = adata.X.copy()
# reduce the cardinality of the discrete columns by mapping each value to its bin mean
X[:, discrete_cols_i] = numpy.apply_along_axis(rediscretize, 0, X[:, discrete_cols_i])
/srv/scratch/maximl/mambaforge/envs/scip/lib/python3.9/site-packages/numpy/core/fromnumeric.py:3474: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/srv/scratch/maximl/mambaforge/envs/scip/lib/python3.9/site-packages/numpy/core/_methods.py:189: RuntimeWarning: invalid value encountered in true_divide
  ret = ret.dtype.type(ret / rcount)
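The mutual information between each feature and the replicate label quantifies how strongly a feature is associated with batch. Sorting these values and locating the knee of the curve gives a cut-off: features above it are considered replicate-driven and are removed.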
In [191]:
%%time
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    mi_post = mutual_info_classif(X=X, y=adata.obs["meta_replicate"], discrete_features=discrete_cols_i, n_neighbors=30, random_state=0)
    mi_post = pandas.Series(mi_post, index=adata.var_names).sort_values(ascending=False)
CPU times: user 4min 12s, sys: 8.11 s, total: 4min 20s
Wall time: 4min 20s
In [192]:
kneedle = KneeLocator(numpy.arange(len(mi_post)), mi_post, S=40, curve="convex", direction="decreasing", online=False)
elbow_value = mi_post.iloc[int(kneedle.knee)]

kneedle.plot_knee()
In [220]:
# keep the features below the elbow, i.e. those least associated with the replicate label
selected_mi = mi_post[mi_post < elbow_value].index.values
len(selected_mi) / len(mi_post)
Out[220]:
0.8407643312101911
In [194]:
len(mi_post[mi_post > elbow_value].index.values)  # features above the elbow (dropped)
Out[194]:
149
In [195]:
adata2 = adata[:, selected_mi].copy()

Feature clustering¶

In [197]:
# pairwise Spearman correlation between the retained features
corr = scipy.stats.spearmanr(adata2.X).correlation
seaborn.clustermap(corr, vmin=-1, vmax=1, center=0)
/srv/scratch/maximl/mambaforge/envs/scip/lib/python3.9/site-packages/seaborn/matrix.py:657: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
Out[197]:
<seaborn.matrix.ClusterGrid at 0x7f0ff8e89e50>
In [198]:
# enforce symmetry and a unit diagonal before converting to distances
corr = (corr + corr.T) / 2
numpy.fill_diagonal(corr, 1)

def feat_from_corr(corr):
    # We convert the correlation matrix to a distance matrix before performing
    # hierarchical clustering using Ward's linkage.
    distance_matrix = 1 - numpy.abs(corr)
    dist_linkage = hierarchy.ward(squareform(distance_matrix))

    clusters = fcluster(dist_linkage, 0.1, criterion="distance").ravel()

    # from every cluster of correlated features, keep the one with the highest variance
    indices = []
    for c in numpy.unique(clusters):
        ci = numpy.flatnonzero(clusters == c)
        indices.append(ci[adata2.X[:, ci].std(axis=0).argmax()])

    features = [adata2.var_names[i] for i in indices]
    return features, indices, clusters

features, indices, clusters = feat_from_corr(corr)
In [199]:
seaborn.clustermap(corr[indices, :][:, indices], vmin=-1, vmax=1, center=0)
Out[199]:
<seaborn.matrix.ClusterGrid at 0x7f0ff90a3070>
In [200]:
adata2.var["selected_corr"] = False
adata2.var.loc[features, "selected_corr"] = True
adata2.var["feature_clusters"] = clusters
In [201]:
scanpy.pp.scale(adata2)
In [203]:
adata3 = adata2[:, adata2.var.selected_corr].copy()
In [205]:
scanpy.tl.pca(adata3, svd_solver='arpack', random_state=0)
scanpy.pp.neighbors(adata3, n_neighbors=30, method="umap", random_state=0)
computing PCA
    with n_comps=50
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:10)
In [206]:
%%time
resolutions = [0.1, 0.25, 0.5, 0.75, 1]
for res in resolutions:
    scanpy.tl.leiden(adata3, resolution=res, key_added=f"leiden_{res}", random_state=0)
running Leiden clustering
    finished: found 3 clusters and added
    'leiden_0.1', the cluster labels (adata.obs, categorical) (0:00:07)
running Leiden clustering
    finished: found 4 clusters and added
    'leiden_0.25', the cluster labels (adata.obs, categorical) (0:00:10)
running Leiden clustering
    finished: found 7 clusters and added
    'leiden_0.5', the cluster labels (adata.obs, categorical) (0:00:36)
running Leiden clustering
    finished: found 10 clusters and added
    'leiden_0.75', the cluster labels (adata.obs, categorical) (0:00:26)
running Leiden clustering
    finished: found 12 clusters and added
    'leiden_1', the cluster labels (adata.obs, categorical) (0:00:25)
CPU times: user 1min 43s, sys: 3.64 s, total: 1min 47s
Wall time: 1min 46s
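Several resolutions are computed so that a suitable granularity can be chosen after inspecting the embeddings; resolution 0.75 is retained further down.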
In [207]:
scanpy.tl.umap(adata3, random_state=0)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:45)
In [208]:
scanpy.pl.umap(adata3, color=["leiden_0.25", "leiden_0.5", "leiden_0.75", "leiden_1", "meta_replicate"], legend_loc='on data')
In [209]:
adata3.obs["leiden"] = adata3.obs["leiden_0.75"]
In [210]:
seaborn.countplot(data=adata3.obs, x="leiden", hue="meta_replicate")
Out[210]:
<AxesSubplot:xlabel='leiden', ylabel='count'>
In [211]:
# combine the full mutual-information-selected feature matrix (adata2) with the
# clustering and embedding results computed on the correlation-reduced set (adata3)
adata_out = anndata.AnnData(
    X=adata2.X,
    var=adata2.var,
    obs=adata3.obs,
    obsm=adata3.obsm,
    raw=adata2.raw
)
In [212]:
import pickle
with open(data_dir / "adata.pickle", "wb") as fh:
    pickle.dump(adata_out, fh)
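Note that the output path defined in the Data section points to an .h5ad file; a sketch of also writing that format (assuming the pickle above is kept for convenience):

adata_out.write(output)  # AnnData's standard HDF5 serialization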